Generate Random Walks

# Set the number of time steps
n_steps = 100
# Set the number of random walks
n_walks = 100

# Generate random walks with the same mean and variance
set.seed(123)  # for reproducibility
cov_matrix = diag(n_walks)
cov_matrix[cov_matrix == 0] = 0.6
random_walks = mvrnorm(n = n_steps, mu = rep(0, n_walks), Sigma = cov_matrix)
random_walks = apply(random_walks, 2, cumsum)

intercepts = rnorm(n_walks,0,5)
for (i in 1:ncol(random_walks)){
  random_walks[,i] = random_walks[,i] + intercepts[i]
}

# Create a data frame in wide format
wide_df = data.frame(matrix(random_walks, ncol = n_walks))
colnames(wide_df) = paste0("x", 1:n_walks)
wide_df$t = 1:n_steps
cov_matrix[c(1:10),c(1:10)]
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]  1.0  0.6  0.6  0.6  0.6  0.6  0.6  0.6  0.6   0.6
##  [2,]  0.6  1.0  0.6  0.6  0.6  0.6  0.6  0.6  0.6   0.6
##  [3,]  0.6  0.6  1.0  0.6  0.6  0.6  0.6  0.6  0.6   0.6
##  [4,]  0.6  0.6  0.6  1.0  0.6  0.6  0.6  0.6  0.6   0.6
##  [5,]  0.6  0.6  0.6  0.6  1.0  0.6  0.6  0.6  0.6   0.6
##  [6,]  0.6  0.6  0.6  0.6  0.6  1.0  0.6  0.6  0.6   0.6
##  [7,]  0.6  0.6  0.6  0.6  0.6  0.6  1.0  0.6  0.6   0.6
##  [8,]  0.6  0.6  0.6  0.6  0.6  0.6  0.6  1.0  0.6   0.6
##  [9,]  0.6  0.6  0.6  0.6  0.6  0.6  0.6  0.6  1.0   0.6
## [10,]  0.6  0.6  0.6  0.6  0.6  0.6  0.6  0.6  0.6   1.0
# Plot the time series
matplot(as.matrix(wide_df[,-ncol(wide_df)]), type = "l", xlab = "t", ylab = "Value", main = "100 Time Series - Multivariate Random Walks")

# Convert to long format using tidyr
### Pivot
long_df = wide_df %>%
  pivot_longer(cols = starts_with("x"),
               names_to = "level",
               values_to = "value")


# Print the long format data frame
print(long_df)
## # A tibble: 10,000 × 3
##        t level  value
##    <int> <chr>  <dbl>
##  1     1 x1    12.2  
##  2     1 x2    -0.704
##  3     1 x3     3.58 
##  4     1 x4    -2.02 
##  5     1 x5     1.72 
##  6     1 x6     6.88 
##  7     1 x7     6.08 
##  8     1 x8    -1.13 
##  9     1 x9    -7.21 
## 10     1 x10   -0.670
## # ℹ 9,990 more rows

Create linear basis functions

# Number of knots
n_knots = 10

# Define the piecewise linear function
piecewise_linear = function(x, k) {
  y = x - k
  y = ifelse(x < 0, 0, y)
  y = ifelse(x < k, 0, y)
  return(y)
}

# Define a function to create a linear spline basis
create_basis = function(x, k) {
  num_x = length(x)
  num_k = length(k)
  b = matrix(0, nrow = num_x, ncol = num_k)
  for (i in 1:num_k) {
    b_i = piecewise_linear(x, k[i])
    b[, i] = b_i
  }
  return(b)
}

# Example usage
x = seq(1, n_steps, by = 1)  # Sample x values
k_values = c(0:n_knots)*(n_steps/n_knots)   # Values of k
basis_matrix_wide = create_basis(x, k_values)
basis_matrix_wide = basis_matrix_wide/max(basis_matrix_wide)
colnames(basis_matrix_wide) = paste0("basis_X", c(1:11))
head(basis_matrix_wide)
##      basis_X1 basis_X2 basis_X3 basis_X4 basis_X5 basis_X6 basis_X7 basis_X8
## [1,]     0.01        0        0        0        0        0        0        0
## [2,]     0.02        0        0        0        0        0        0        0
## [3,]     0.03        0        0        0        0        0        0        0
## [4,]     0.04        0        0        0        0        0        0        0
## [5,]     0.05        0        0        0        0        0        0        0
## [6,]     0.06        0        0        0        0        0        0        0
##      basis_X9 basis_X10 basis_X11
## [1,]        0         0         0
## [2,]        0         0         0
## [3,]        0         0         0
## [4,]        0         0         0
## [5,]        0         0         0
## [6,]        0         0         0
plot(basis_matrix_wide[,1], type="lines", col="blue")
lines(basis_matrix_wide[,2], col="red")
lines(basis_matrix_wide[,3], col="green")
lines(basis_matrix_wide[,4], col="purple")
lines(basis_matrix_wide[,5], col="red")
lines(basis_matrix_wide[,6], col="green")
lines(basis_matrix_wide[,7], col="purple")
lines(basis_matrix_wide[,8], col="red")
lines(basis_matrix_wide[,9], col="green")
lines(basis_matrix_wide[,10], col="purple")

## Merge with the long format data

t = c(1:n_steps)
basis_matrix_wide = cbind(t, basis_matrix_wide)
basis_matrix_wide = as.data.frame(basis_matrix_wide)

## Merge the dataframes
long_df = long_df %>% left_join(basis_matrix_wide, by='t')
long_df
## # A tibble: 10,000 × 14
##        t level  value basis_X1 basis_X2 basis_X3 basis_X4 basis_X5 basis_X6
##    <dbl> <chr>  <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
##  1     1 x1    12.2       0.01        0        0        0        0        0
##  2     1 x2    -0.704     0.01        0        0        0        0        0
##  3     1 x3     3.58      0.01        0        0        0        0        0
##  4     1 x4    -2.02      0.01        0        0        0        0        0
##  5     1 x5     1.72      0.01        0        0        0        0        0
##  6     1 x6     6.88      0.01        0        0        0        0        0
##  7     1 x7     6.08      0.01        0        0        0        0        0
##  8     1 x8    -1.13      0.01        0        0        0        0        0
##  9     1 x9    -7.21      0.01        0        0        0        0        0
## 10     1 x10   -0.670     0.01        0        0        0        0        0
## # ℹ 9,990 more rows
## # ℹ 5 more variables: basis_X7 <dbl>, basis_X8 <dbl>, basis_X9 <dbl>,
## #   basis_X10 <dbl>, basis_X11 <dbl>

Drop 97 data points from a single series

# Now try filtering out all but a handful of data points from specific series and then retrain the model, predict again.
# Keep only 3 data points at x10 and retrain
drop = which(long_df$level == "x10")
drop = drop[-c(5, 25, 91)]

sample_df = long_df[-drop,]
create_formula = function(num_basis, target = "value"){
  
  f = paste0(target, "  ~ (1 | level) + ")
  for (i in 1:num_basis){
    f = paste0(f, "(basis_X",i, " - 1 | level) + basis_X",i)
    if (i < num_basis){
      f = paste0(f, " + ")
    }
  }
  f = as.formula(f)
  return(f)
}
f = create_formula(10)
lm_fit = lmer(f, data=sample_df)

y_p_train = predict(lm_fit)
sample_df$y_p = y_p_train
for (x in unique(sample_df$level)){
  x1 = sample_df %>% filter(level == x)
  plot(x1$value)
  lines(x1$y_p, col="red")
}

# out of sample
y_p_test = predict(lm_fit, newdata=long_df)
long_df$y_p = y_p_test
x1 = long_df %>% filter(level == 'x10')
  print(x)
## [1] "x10"
  plot(x1$value)
  lines(x1$y_p, col="red")

rf = ranef(lm_fit)
fe = fixef(lm_fit)
hist(rf$level$basis_X1)

fe
## (Intercept)    basis_X1    basis_X2    basis_X3    basis_X4    basis_X5 
##   0.3875783 -22.8253528  22.2025734  37.7532642 -72.0846254  40.7230919 
##    basis_X6    basis_X7    basis_X8    basis_X9   basis_X10 
## -22.2477729  20.1480650  10.5205851 -36.6929338 -23.3739660
drop = which(long_df$level == "x10")
drop = drop[-c(5, 25,45, 60, 90)]

sample_df = long_df[-drop,]
f = create_formula(10)
lm_fit = lmer(f, data=sample_df)

y_p_train = predict(lm_fit)
sample_df$y_p = y_p_train
y_p_test = predict(lm_fit, newdata=long_df)
long_df$y_p = y_p_test
x1 = long_df %>% filter(level == 'x10')
  print(x)
## [1] "x10"
  plot(x1$value)
  lines(x1$y_p, col="red")

drop = which(long_df$level == "x10")
drop = drop[-c(5, 25,45, 60,84, 90)]

sample_df = long_df[-drop,]
f = create_formula(10)
lm_fit = lmer(f, data=sample_df)

y_p_train = predict(lm_fit)
sample_df$y_p = y_p_train

y_p_test = predict(lm_fit, newdata=long_df)
long_df$y_p = y_p_test

x1 = long_df %>% filter(level == 'x10')
  print(x)
## [1] "x10"
  plot(x1$value)
  lines(x1$y_p, col="red")